# Load the assignment data and convert the column names to snake_case.
bcdata <- read_csv("bcdata_Assignment1.csv") |>
  janitor::clean_names()
# str() only prints the structure and returns NULL invisibly, so assigning its
# result stores nothing. Print the structure, then keep the class of each
# column in `data_type` so the variable actually holds the data types.
str(bcdata) # all variables are numeric.
data_type <- vapply(bcdata, function(col) class(col)[1L], character(1))
## spc_tbl_ [116 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:116] 48 83 82 68 86 49 89 76 73 75 ...
## $ bmi : num [1:116] 23.5 20.7 23.1 21.4 21.1 ...
## $ glucose : num [1:116] 70 92 91 77 92 92 77 118 97 83 ...
## $ insulin : num [1:116] 2.71 3.12 4.5 3.23 3.55 ...
## $ homa : num [1:116] 0.467 0.707 1.01 0.613 0.805 ...
## $ leptin : num [1:116] 8.81 8.84 17.94 9.88 6.7 ...
## $ adiponectin : num [1:116] 9.7 5.43 22.43 7.17 4.82 ...
## $ resistin : num [1:116] 8 4.06 9.28 12.77 10.58 ...
## $ mcp_1 : num [1:116] 417 469 555 928 774 ...
## $ classification: num [1:116] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. BMI = col_double(),
## .. Glucose = col_double(),
## .. Insulin = col_double(),
## .. HOMA = col_double(),
## .. Leptin = col_double(),
## .. Adiponectin = col_double(),
## .. Resistin = col_double(),
## .. MCP.1 = col_double(),
## .. Classification = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column descriptive statistics; the n_missing column shows that no
# variable in this dataset has missing values.
summary <- skimr::skim(bcdata)
summary
| Name | bcdata |
| Number of rows | 116 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 57.30 | 16.11 | 24.00 | 45.00 | 56.00 | 71.00 | 89.00 | ▃▇▅▇▃ |
| bmi | 0 | 1 | 27.58 | 5.02 | 18.37 | 22.97 | 27.66 | 31.24 | 38.58 | ▅▆▇▅▃ |
| glucose | 0 | 1 | 97.79 | 22.53 | 60.00 | 85.75 | 92.00 | 102.00 | 201.00 | ▅▇▁▁▁ |
| insulin | 0 | 1 | 10.01 | 10.07 | 2.43 | 4.36 | 5.92 | 11.19 | 58.46 | ▇▁▁▁▁ |
| homa | 0 | 1 | 2.69 | 3.64 | 0.47 | 0.92 | 1.38 | 2.86 | 25.05 | ▇▁▁▁▁ |
| leptin | 0 | 1 | 26.62 | 19.18 | 4.31 | 12.31 | 20.27 | 37.38 | 90.28 | ▇▃▂▁▁ |
| adiponectin | 0 | 1 | 10.18 | 6.84 | 1.66 | 5.47 | 8.35 | 11.82 | 38.04 | ▇▅▂▁▁ |
| resistin | 0 | 1 | 14.73 | 12.39 | 3.21 | 6.88 | 10.83 | 17.76 | 82.10 | ▇▂▁▁▁ |
| mcp_1 | 0 | 1 | 534.65 | 345.91 | 45.84 | 269.98 | 471.32 | 700.08 | 1698.44 | ▇▇▃▁▁ |
| classification | 0 | 1 | 1.55 | 0.50 | 1.00 | 1.00 | 2.00 | 2.00 | 2.00 | ▆▁▁▁▇ |
# Derive the WHO BMI category for each subject, give the outcome readable
# labels, and sort by BMI so the recoding is easy to inspect.
# case_when() evaluates its conditions in order, so each upper bound alone
# defines a band; a missing bmi matches no condition and stays NA. Unlike
# nested ifelse(), the result is always a character column, even when bmi is
# empty or entirely NA.
bcdata <- bcdata |>
  mutate(
    who_bmi = case_when(
      bmi < 16.5 ~ "Severely underweight",
      bmi < 18.5 ~ "Underweight",
      bmi < 25 ~ "Normal weight",
      bmi < 30 ~ "Overweight",
      bmi < 35 ~ "Obesity class I",
      bmi < 40 ~ "Obesity class II",
      bmi >= 40 ~ "Obesity class III"
    ),
    classification = recode(
      classification,
      "1" = "Healthy Controls",
      "2" = "Breast Cancer Patients"
    )
  ) |>
  arrange(bmi)
# Sanity check of the recoding: cross-tabulate every BMI value against the
# WHO category it was assigned.
table(bcdata[["bmi"]], bcdata[["who_bmi"]])
# There are no healthy controls in the "Underweight" category, so a
# zero-count "Underweight" / "Healthy Controls" row is added below; with both
# groups present in every category, all bars are drawn with the same width.

# Count subjects per BMI category and group, then express each count as a
# percentage of its BMI category (percentages sum to 100 within a category).
# `.groups = "drop_last"` makes the grouping used by mutate() explicit, and
# ungroup() prevents the grouping from leaking into later steps.
freq_table <- bcdata |>
  group_by(who_bmi, classification) |>
  summarise(n = n(), .groups = "drop_last") |>
  mutate(proportion = n / sum(n) * 100) |>
  ungroup()

# Placeholder row for the combination that does not occur in the data.
supp <- data.frame(
  who_bmi = "Underweight",
  classification = "Healthy Controls",
  n = 0,
  proportion = 0
)

final_freq <- bind_rows(supp, freq_table) |>
  mutate(who_bmi = as.factor(who_bmi))

# WHO categories from lowest to highest BMI, used to order the x axis.
level <- c(
  "Severely underweight", "Underweight", "Normal weight", "Overweight",
  "Obesity class I", "Obesity class II", "Obesity class III"
)

final_freq |>
  mutate(
    # Reorder only the categories that occur in the data; fct_relevel() warns
    # about unknown levels when given categories the factor does not contain.
    who_bmi = forcats::fct_relevel(who_bmi, intersect(level, levels(who_bmi))),
    # Round so the labels read e.g. "33.3%" instead of a full-precision double.
    text_label = str_c(round(proportion, 1), "%")
  ) |>
  plot_ly(
    x = ~who_bmi,
    y = ~proportion,
    color = ~classification,
    type = "bar",
    colors = "viridis",
    text = ~text_label
  )
But honestly, I think a bar chart showing each group's proportion of the whole sample within each category (so that the bars within a category do not sum to 100%) would be more informative.